import glob
import os

from PIL import Image
from tabulate import tabulate

os.getcwd()
#Ideally create different folders for the mons so that you can call only the ones needed for analysis
# Glob pattern for the sprite images (assuming png).
IMAGE_GLOB = '/Users/adityakaul/pokemon_data/POKEMON/*.png'

# Glob exactly once and sort the result: image_list and pokedex_no are both
# derived from this single list, so position i in one always matches
# position i in the other.
names = sorted(glob.glob(IMAGE_GLOB))

image_list = []
for filename in names:
    # Image.open is lazy and keeps the file open; load the pixel data and
    # let the context manager close the file so that a large directory does
    # not exhaust the process's file descriptors.
    with Image.open(filename) as im:
        im.load()
    image_list.append(im)
len(image_list)

# Show the image at location 103
#image_list[103]

# Get the pokedex number for every image so an image can be looked up by
# number. Characters 1-3 of the file name are taken and leading zeros are
# stripped; slicing the base name (rather than the full path) keeps this
# working if the directory changes.
names[0]
pokedex_no = [os.path.basename(path)[1:4].lstrip("0") for path in names]
len(pokedex_no)

# Get the pokedex number at each location in the list
print(pokedex_no[1])
image_list[1]
import pandas as pd
from IPython.display import display
#poke_info.head()
#Get the different features about pokemon from the csv file
# Load the per-Pokemon feature table from the csv file.
poke_info = pd.read_csv("/Users/adityakaul/pokemon_data/pokemon.csv")
# Show every column when a frame is displayed.
pd.options.display.max_columns = None
#display(poke_info)
# Display the top 5 rows of data from the dataframe
poke_info.head(5)

# Split by the is_legendary flag. Explicit copies are taken because later
# code adds columns to these frames; assigning into a plain slice of
# poke_info triggers SettingWithCopyWarning and is not guaranteed to stick.
non_legendary = poke_info[poke_info['is_legendary'] == 0].copy()
legendary = poke_info[poke_info['is_legendary'] == 1].copy()

# Distinct primary types found in each group.
legendary_type = legendary['type1'].unique().tolist()
non_legendary_type = non_legendary['type1'].unique().tolist()
def pokemon_subset(pokemon_type, data):
    """Return the name, id and base-stat columns of all Pokemon of one type.

    A row matches when either its ``type1`` or its ``type2`` equals
    ``pokemon_type``. Fully duplicated rows are dropped before the columns
    are selected.

    Parameters:
        pokemon_type: value compared against the ``type1`` / ``type2`` columns.
        data: DataFrame containing ``type1``, ``type2`` and the nine columns
            returned below.

    Returns:
        A new DataFrame (independent copy) with the columns ``name``,
        ``pokedex_number``, ``generation``, ``sp_attack``, ``sp_defense``,
        ``speed``, ``hp``, ``defense`` and ``attack``. It is empty when no
        row has the requested type.
    """
    has_type = (data['type1'] == pokemon_type) | (data['type2'] == pokemon_type)
    matches = data[has_type].drop_duplicates()
    #print(tabulate(pokemon_type_sub, headers='keys', tablefmt='psql'))
    # Return a copy: callers add columns to the result, which must neither
    # warn nor write through to ``data``.
    return matches[['name', 'pokedex_number', 'generation', 'sp_attack',
                    'sp_defense', 'speed', 'hp', 'defense', 'attack']].copy()
def strongest_pokemon(pokemon_type, data):
    """Print and display the Pokemon of one type with the highest stat total.

    The stat total is the sum of ``sp_attack``, ``sp_defense``, ``speed``,
    ``hp``, ``defense`` and ``attack``. The winning row is printed as a
    psql-style table and its image is shown via ``display``.

    Parameters:
        pokemon_type: type to look for in ``type1`` / ``type2``.
        data: DataFrame to search (passed on to ``pokemon_subset``).

    Returns:
        A one-row DataFrame describing the strongest Pokemon, or ``None``
        when ``data`` holds no Pokemon of that type.

    Raises:
        ValueError: if no loaded image carries the winner's pokedex number.
    """
    # Copy so the two helper columns below never touch the caller's frame.
    df = pokemon_subset(pokemon_type, data).copy()
    if df.empty:
        # idxmax() raises on an empty frame; there is simply nothing to show.
        return None

    stat_columns = ['sp_attack', 'sp_defense', 'speed', 'hp', 'defense', 'attack']
    # skipna=False mirrors plain "+": a missing stat makes the total missing.
    df['total_stats'] = df[stat_columns].sum(axis=1, skipna=False)
    df['type'] = pokemon_type

    #strongest = df[df['total_stats'] == df['total_stats'].max()]
    strongest = df.loc[df['total_stats'].idxmax()].to_frame().transpose()
    strongest = strongest[['name','type','generation','sp_attack','sp_defense','speed','hp','defense','attack','pokedex_number']]
    print(tabulate(strongest[['name','type','generation','sp_attack','sp_defense','speed','hp','defense','attack']], headers='keys', tablefmt='psql'))

    # Take the scalar explicitly; calling int() on a whole Series is deprecated.
    number = str(int(strongest['pokedex_number'].iloc[0]))
    display(image_list[pokedex_no.index(number)])
    return strongest
# Report the strongest Pokemon for every primary type, first among the
# legendaries and then among the non-legendaries.
for type_names, frame in ((legendary_type, legendary),
                          (non_legendary_type, non_legendary)):
    for val in type_names:
        df = strongest_pokemon(val, frame)
poke_info.columns
def generation_mon(gen):
    """Show the strongest non-legendary Pokemon of each type in one generation.

    Rows of ``poke_info`` are restricted to ``generation == gen`` and
    ``is_legendary != 1``; ``strongest_pokemon`` is then called for every
    type in ``non_legendary_type`` that occurs in that subset.

    Parameters:
        gen: value compared against the ``generation`` column.

    Returns:
        Whatever ``strongest_pokemon`` returned for the last type processed,
        or ``None`` when no type was processed.
    """
    gen_sub = poke_info[(poke_info['generation'] == gen) & (poke_info['is_legendary'] != 1)]

    # Types that actually occur in this generation. Skipping the others
    # avoids asking for the maximum of an empty subset.
    types_present = set(gen_sub['type1'].dropna()) | set(gen_sub['type2'].dropna())

    df = None  # defined up front so the return cannot hit an unbound name
    for val in non_legendary_type:
        if val not in types_present:
            continue
        df = strongest_pokemon(val, gen_sub)
    return df
# Every distinct generation value in the table, in order of first appearance.
gen_list = poke_info['generation'].unique().tolist()

# Run the per-type "strongest" report once per generation.
for val in gen_list:
    generation_mon(val)

non_legendary.head()
non_legendary.head()
# Work on an independent copy: a plain assignment would only alias
# non_legendary, so the new column below would be written into it as well.
non_legendary_mod = non_legendary.copy()
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# The against_* columns that are summed into a single weakness score.
against_columns = [
    'against_bug', 'against_dark', 'against_dragon', 'against_electric',
    'against_fairy', 'against_fight', 'against_fire', 'against_flying',
    'against_ghost', 'against_grass', 'against_ground', 'against_ice',
    'against_normal', 'against_poison', 'against_psychic', 'against_rock',
    'against_steel', 'against_water',
]
# skipna=False mirrors chained "+": any missing value makes the total missing.
non_legendary_mod['total_weakness'] = non_legendary_mod[against_columns].sum(axis=1, skipna=False)


def _show_extreme(frame, column, largest):
    """Print the row with the largest or smallest *column* and show its image.

    Parameters:
        frame: DataFrame to search.
        column: name of the column to maximise or minimise.
        largest: True to pick the maximum, False to pick the minimum.

    Returns:
        A one-row DataFrame with the identifying columns plus *column*.
    """
    values = frame[column]
    label = values.idxmax() if largest else values.idxmin()
    row = frame.loc[label].to_frame().transpose()
    # 'classfication' is spelled the way the code has always referenced it.
    row = row[['name', 'pokedex_number', 'generation', 'classfication', 'type1', 'type2', column]]
    print(tabulate(row, headers='keys', tablefmt='psql'))
    # Take the scalar explicitly; calling int() on a whole Series is deprecated.
    number = str(int(row['pokedex_number'].iloc[0]))
    display(image_list[pokedex_no.index(number)])
    return row


# Highest and lowest summed against_* score.
weakest_mon = _show_extreme(non_legendary_mod, 'total_weakness', largest=True)
strongest_mon = _show_extreme(non_legendary_mod, 'total_weakness', largest=False)

# Heaviest and lightest by weight_kg.
heaviest = _show_extreme(non_legendary_mod, 'weight_kg', largest=True)
lightest = _show_extreme(non_legendary_mod, 'weight_kg', largest=False)

# Tallest and shortest by height_m.
tallest = _show_extreme(non_legendary_mod, 'height_m', largest=True)
shortest = _show_extreme(non_legendary_mod, 'height_m', largest=False)
non_legendary.head()
# Count each non-legendary Pokemon once for every type it has. Stacking
# type1 and type2 and counting the values gives the right total even for a
# type that appears in only one of the two columns (an outer merge followed
# by "+" yields NaN there). Missing type2 values are ignored by
# value_counts().
type_counts = pd.concat([non_legendary['type1'], non_legendary['type2']]).value_counts().sort_index()
# Same layout as before: one row per type, columns 'type1' and 'no_of_mons'.
type_mons = type_counts.rename_axis('type1').reset_index(name='no_of_mons')
type_mons.transpose()
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Bar chart of the per-type counts currently held in type_mons.
type_mons.plot.barh(x='type1', y='no_of_mons', rot=1, color = 'steelblue')
plt.show()

# Same count for the legendaries. Stacking type1 and type2 before counting
# keeps types that occur only as a secondary type and gives real totals
# (not NaN) to types that occur only as a primary type.
legendary_counts = pd.concat([legendary['type1'], legendary['type2']]).value_counts().sort_index()
type_mons = legendary_counts.rename_axis('type1').reset_index(name='no_of_mons')
type_mons.transpose().fillna(0)
type_mons.plot.barh(x='type1', y='no_of_mons', rot=1, color = 'steelblue')
plt.show()
#perform clustering to see if all pokemon are classified based on type properly
#perform some predictions to see if a regular pokemon can be legendary based on features?
#poke_info['abilities'][0].strip('\"')
#poke_info['abilities'].unique()
#poke_info['capture_rate'].unique()
# Overwrite the capture_rate of the row at position 773 with '255'
# (presumably that entry is not a plain number - verify against the csv).
# A single positional assignment is used because the chained form
# frame.col.iloc[...] = value may write to a temporary copy instead.
poke_info.iloc[773, poke_info.columns.get_loc('capture_rate')] = '255'
poke_info['capture_rate'].unique()

# Candidate features for the legendary classifier, with the target last.
ml_columns = [
    'against_bug', 'against_dark', 'against_dragon',
    'against_electric', 'against_fairy', 'against_fight', 'against_fire',
    'against_flying', 'against_ghost', 'against_grass', 'against_ground',
    'against_ice', 'against_normal', 'against_poison', 'against_psychic',
    'against_rock', 'against_steel', 'against_water', 'attack',
    'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
    'defense', 'experience_growth', 'height_m', 'hp',
    'sp_attack', 'sp_defense', 'speed', 'weight_kg',
    'is_legendary',
]
poke_info_ml = poke_info[ml_columns]
poke_info_ml.head()

# Drop rows with any missing value before modelling.
poke_info_ml_clean = poke_info_ml.dropna()
len(poke_info_ml_clean)

# Y holds the target column name, X every remaining column name.
data_final_vars = poke_info_ml_clean.columns.values.tolist()
Y = ['is_legendary']
X = [i for i in data_final_vars if i not in Y]
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination: keep the 18 features a logistic regression
# ranks highest.
logreg = LogisticRegression()
# Pass the feature count by keyword; newer scikit-learn releases no longer
# accept it positionally.
rfe = RFE(logreg, n_features_to_select=18)
# The target is flattened to 1-D, which is the shape the estimator expects.
rfe = rfe.fit(poke_info_ml_clean[X], poke_info_ml_clean[Y].values.ravel())
print(rfe.support_)
print(rfe.ranking_)
# The 18 feature columns kept for the model, listed once and reused below.
useful_features = [
    'against_bug', 'against_dark', 'against_dragon',
    'against_fight',
    'against_grass',
    'against_ice',
    'attack',
    'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
    'defense', 'experience_growth', 'hp',
    'sp_attack', 'sp_defense', 'speed', 'weight_kg',
]
# Explicit copy: the columns are converted in place below, which must not
# be done on a slice of poke_info.
poke_info_ml_useful = poke_info[useful_features + ['is_legendary']].copy()
poke_info_ml_useful.head()
#poke_info_ml_useful.head()
poke_info_ml_useful.dtypes

# Make every feature numeric (capture_rate is written as the string '255'
# elsewhere in this script) and mark the target as categorical.
poke_info_ml_useful[useful_features] = poke_info_ml_useful[useful_features].apply(pd.to_numeric)
poke_info_ml_useful[['is_legendary']] = poke_info_ml_useful[['is_legendary']].astype('category')

# Y holds the target column name, X every remaining column name.
poke_info_ml_useful_vars = poke_info_ml_useful.columns.values.tolist()
Y = ['is_legendary']
X = [i for i in poke_info_ml_useful_vars if i not in Y]

# Drop rows with any missing value before modelling.
poke_info_ml_useful = poke_info_ml_useful.dropna()
len(poke_info_ml_useful)
import statsmodels.api as sm
# Fit a statsmodels logit on the full cleaned table and print its summary.
logit_model = sm.Logit(poke_info_ml_useful[Y], poke_info_ml_useful[X])
result = logit_model.fit()
print(result.summary())

# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in current releases.
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed; the target is flattened to a 1-D array.
X_train, X_test, y_train, y_test = train_test_split(poke_info_ml_useful[X], np.ravel(poke_info_ml_useful[Y]), test_size=0.3, random_state=0)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Train on the training part and score on the held-out part.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
len(X_train)
len(X_test)
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
# 10-fold cross validation on the training part. random_state only has an
# effect together with shuffle=True, and current scikit-learn rejects a
# seed without shuffling, so shuffling is enabled explicitly.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))
from sklearn.metrics import confusion_matrix
# Store the matrix under its own name so the imported confusion_matrix
# function is not overwritten (re-running this section keeps working).
# labels=[0, 1] pins the layout to 2x2 even if one class is absent.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(conf_matrix)

# scikit-learn puts true labels on rows and predictions on columns, in
# label order, so with 1 = legendary as the positive class the flattened
# matrix reads: true negatives, false positives, false negatives, true
# positives.
tn, fp, fn, tp = conf_matrix.ravel()

accuracy = (tp + tn)/(tp + tn + fp + fn)
print("Accuracy: " + str(round(accuracy*100,2)) + "%")
# Report 0 instead of dividing by zero when a denominator is empty.
precision = tp/(tp + fp) if (tp + fp) else 0.0
print("Precision: " + str(round(precision*100,2)) + "%")
recall = tp/(tp + fn) if (tp + fn) else 0.0
print("Recall: " + str(round(recall*100,2)) + "%")
f1_score = (2*precision*recall)/(precision + recall) if (precision + recall) else 0.0
print("F1 Score: " + str(round(f1_score*100,2)) + "%")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Probability of the positive class (column 1 of predict_proba). The same
# scores feed both the AUC and the curve, so the area in the legend is the
# area under the curve that is drawn; scoring the AUC on hard 0/1
# predictions would describe a different, single-threshold curve.
positive_scores = logreg.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line (a classifier with no skill).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
X_test
pd.DataFrame(y_test)
pd.DataFrame(y_pred)

# Attach the true and predicted labels to the test rows. pd.Series is
# spelled out because a bare Series is never imported in this script.
# NOTE: after this point X_test carries two extra columns and can no longer
# be passed to logreg.predict() as-is.
X_test['y_test'] = pd.Series(y_test, index=X_test.index)
X_test['y_pred'] = pd.Series(y_pred, index=X_test.index)
X_test.head()


def _print_pokemon(labels):
    """Print one identifying psql table per index label in *labels*."""
    for label in labels:
        # The values are index labels carried over from poke_info, so the
        # label-based .loc is the matching accessor (not positional .iloc).
        row = poke_info[['name','pokedex_number','type1','type2','generation']].loc[label]
        print(tabulate(row.to_frame().transpose(), headers='keys', tablefmt='psql'))


# Find which ones were wrongly classified: legendaries predicted as
# non-legendary ...
wrongly_classified = X_test[(X_test['y_pred'] == 0) & (X_test['y_test'] == 1)].index.values
_print_pokemon(wrongly_classified)

# ... and non-legendaries predicted as legendary.
wrongly_classified_non_leg = X_test[(X_test['y_pred'] == 1) & (X_test['y_test'] == 0)].index.values
_print_pokemon(wrongly_classified_non_leg)